# Imports
import json
import random
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.spatial.distance  # scipy submodules are not loaded by 'import scipy'
import scipy.stats
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn import base, neural_network, neighbors, feature_extraction, datasets, linear_model, metrics, model_selection, preprocessing, svm, ensemble, decomposition, feature_extraction, utils
# Configs
# NOTE: the %-prefixed lines are IPython magics (this file is a notebook export)
# and are not valid plain-Python syntax.
%config InlineBackend.figure_format ='retina'
%matplotlib inline
# Cython
%load_ext Cython
#!python
#cython: language_level=3
print("All libraries set-up and ready to go!")
# Load the five tables: keystrokes (k), words (w), sentences (s), files (f),
# users (u). 'suffix' allows switching to an alternative dataset variant.
suffix = ''
k = pd.read_csv('../../pads/keystrokes' + suffix + '.csv', index_col=False, encoding='latin1')
w = pd.read_csv('../../pads/words' + suffix + '.csv', index_col=False)
s = pd.read_csv('../../pads/sentences' + suffix + '.csv', index_col=False)
f = pd.read_csv('../../pads/files' + suffix + '.csv', index_col=False)
u = pd.read_csv('../../pads/users' + suffix + '.csv', index_col=False)
def find_mean(u_id):
    """95th-percentile-capped mean keystroke time of user u_id (from table u)."""
    return u.at[int(u_id), 't_mean_95']

def find_std(u_id):
    """95th-percentile-capped std of keystroke times of user u_id."""
    return u.at[int(u_id), 't_std_95']

# z-score every keystroke time against the typing statistics of its user
user_means = k['u_id'].apply(find_mean)
user_stds = k['u_id'].apply(find_std)
k['adj_t'] = (k['t'] - user_means) / user_stds
# Runs in approx. 50s
# Build, for every sentence id, the half-open row range [start, end) of its
# keystrokes inside k_sorted, so per-sentence slices can be taken via .iloc.
k_sorted = k.sort_values(by='s_id')
rs = 0; re = 0; ranges = {}
# .items() replaces Series.iteritems(), which was removed in pandas 2.0
for idx, val in k_sorted['s_id'].value_counts().sort_index().items():
    re += val
    ranges[idx] = (rs, re)
    rs += val
# Template loop: visit every sentence together with the slice of keystrokes
# that produced it (empty slice when a sentence has no logged keystrokes).
for s_id, row in s.iterrows():
    rng = ranges.get(s_id)  # single lookup; 'is not None' instead of '!= None'
    if rng is not None:
        t, v = rng
        k_sentence = k_sorted.iloc[t:v, :]
    else:
        k_sentence = k_sorted.iloc[0:0, :]
    ###### YOUR CODE HERE #####
    ## Will run ranging over sentence with s_id, row, k_sentence ##
    ## Example: s.at[s_id, 'ks_len'] = len(k_sentence) / len(row['text'])
# Per-user summary statistics. Heavy-tailed measures are winsorised either at
# a fixed cap (10s) or at the user's own 95th percentile before mean/std.
for u_id, row in u.iterrows():
    k_user = k[k['u_id'] == u_id]
    s_user = s[s['u_id'] == u_id]
    u.loc[u_id, 'k_count'] = len(k_user)
    pct = k_user['t'].quantile(0.95)
    u.loc[u_id, 't_mean'] = k_user['t'].mean()
    u.loc[u_id, 't_std'] = k_user['t'].std()
    # Series.clip is the vectorised equivalent of .apply(lambda x: min(x, cap))
    u.loc[u_id, 't_mean_10000'] = k_user['t'].clip(upper=10000).mean()
    u.loc[u_id, 't_std_10000'] = k_user['t'].clip(upper=10000).std()
    u.loc[u_id, 't_mean_95'] = k_user['t'].clip(upper=pct).mean()
    u.loc[u_id, 't_std_95'] = k_user['t'].clip(upper=pct).std()
    pct = s_user['text_len'].quantile(0.95)
    u.loc[u_id, 'text_len_mean'] = s_user['text_len'].clip(upper=pct).mean()
    u.loc[u_id, 'text_len_std'] = s_user['text_len'].clip(upper=pct).std()
    pct = s_user['del/len'].quantile(0.95)
    u.loc[u_id, 'del/len_mean'] = s_user['del/len'].clip(upper=pct).mean()
    u.loc[u_id, 'del/len_std'] = s_user['del/len'].clip(upper=pct).std()
    u.loc[u_id, 'revision_t/len_filter_10000_mean'] = s_user['revision_t_filter_10000'].sum() / s_user['text_len'].sum()
    u.loc[u_id, 'separator_t/t_filter_10000_mean'] = s_user['separator_t/t_filter_10000'].mean()
    u.loc[u_id, 'word_t/t_filter_10000_mean'] = s_user['word_t/t_filter_10000'].mean()
    u.loc[u_id, 't/word_filter_10000_mean'] = s_user['t_filter_10000'].sum() / s_user['word_count'].sum()
    # pause counts (> 0.5s, 1s, 2s, 5s, 10s) normalised by text length
    for l in ['510', '1000', '2000', '5000', '10000']:
        u.loc[u_id, 'p_'+l+'_no/len'] = s_user['p_'+l+'_no'].sum() / s_user['text_len'].sum()
# Rebuild each file's full text by placing every sentence at its recorded
# start position, padding gaps with spaces and flattening newlines.
for f_id, row in f.iterrows():
    rebuilt = ''
    for s_id, st in s[s['f_id'] == f_id].iterrows():
        gap = st['start_pos'] - len(rebuilt)
        if gap > 0:
            rebuilt += ' ' * gap
        rebuilt += st['text'].replace('\n', ' ')
    f.at[f_id, 'text'] = rebuilt
# Per-word statistics from the keystroke slices.
# NOTE(review): 'ranges' above was built from k sorted by s_id, yet it is
# indexed here with w_id -- confirm a word-level ranges table was intended.
for w_id, row in w.iterrows():
    rng = ranges.get(w_id)
    if rng is not None:
        t, v = rng
        k_word = k_sorted.iloc[t:v, :]
    else:
        k_word = k_sorted.iloc[0:0, :]
    w.at[w_id, 'text_len'] = len(str(row['text']))
    w.at[w_id, 'k_count'] = len(k_word)
    w.at[w_id, 't'] = k_word['t'].sum()
    w.at[w_id, 't/len'] = k_word['t'].sum() / w.at[w_id, 'text_len']
    # clip() replaces the elementwise .apply(lambda x: min(x, cap))
    w.at[w_id, 't/len_filter_2000'] = k_word['t'].clip(upper=2000).sum() / w.at[w_id, 'text_len']
    w.at[w_id, 't/len_filter_10000'] = k_word['t'].clip(upper=10000).sum() / w.at[w_id, 'text_len']
    w.at[w_id, 'max_p'] = k_word['t'].max()
    w.at[w_id, 'n_revisions'] = len(k_word[k_word['op'] == '-'])
s['text_len'].max()  # bare expression: displayed in a notebook, no-op as a script
# Length-normalise complexity metrics: bin sentences by text_len (bin width
# binsz), then re-express each value as stds from its bin's mean.
ms = [('KolmogorovDeflate_value', 'n_Kolmogorov'),
      ('Syntactic.KolmogorovDeflate_value', 'n_SKolmogorow'),
      ('Morphological.KolmogorovDeflate_value', 'n_MKolmogorow'),
      ('Lexical.Diversity.NDW_value', 'n_NDW_value'),
      ('Lexical.Diversity.RTTR_value', 'n_RTTR_value')]
binsz = 5
rgs, rge = 0, 1215
n_bins = (rge - rgs) // binsz  # hoisted: was recomputed three times
for old_n, new_n in ms:
    bin_mean = [0] * n_bins
    bin_std = [0] * n_bins
    for i in range(n_bins):
        b_idx = (s['text_len'] >= rgs + i * binsz) & (s['text_len'] < rgs + (i + 1) * binsz)
        bin_mean[i] = float(s[b_idx][old_n].mean())
        bin_std[i] = float(s[b_idx][old_n].std())
    for idx, row in s.iterrows():
        b = int((row['text_len'] - rgs) / binsz)
        try:
            s.at[idx, new_n] = (row[old_n] - bin_mean[b]) / bin_std[b]
        except (ZeroDivisionError, IndexError):
            # ZeroDivisionError: degenerate bin (std == 0).
            # IndexError: text_len >= rge -- previously crashed uncaught.
            s.at[idx, new_n] = 0
# Subset of CoCoGen measures used in the analyses below, plus the
# length-normalised columns created above.
# NOTE: the comprehension variables f, s in the last line are local to the
# comprehension in Python 3 -- they do not clobber the f/s dataframes.
useful = ['Syntactic.ClausesPerSentence_value',
          'Syntactic.VerbPhrasesPerSentence_value',
          'Syntactic.ComplexNominalsPerSentence_value',
          'NounPhrasePreModificationWords_value',
          'NounPhrasePostModificationWords_value',
          'Lexical.Sophistication.ANC_value',
          'Lexical.Sophistication.BNC_value',
          'Lexical.Density_value'] + \
         [s for f, s in ms]
# Print sample sentences from the high/low keystroke-count splits.
# NOTE(review): high_k and low_k are not defined in this section -- presumably
# produced by a split elsewhere in the notebook; verify before running.
print(high_k.at[7, 'text'])
print(high_k.at[247, 'text'])
print(high_k.at[668, 'text'])
print(high_k.at[95088, 'text'])
print(high_k.at[94887, 'text'])
print(low_k.at[127, 'text'])
print(low_k.at[245, 'text'])
print(low_k.at[254, 'text'])
print(low_k.at[457, 'text'])
print(low_k.at[94661, 'text'])
These can also be divided into two examples; an Acceptability Judgement task and a Magnitude Estimation test.
Both learner corpora showed no significant difference, only the EXPERT texts stood out in comparison.
For example, Miss Kerz collected data on the bigram frequency index across our Learning Journals 1 to 4.
Example given, a group of children (age 10) shoud write a text about dogs to measure lexis.
Kachru´s model of English worldwude is divided int three circles: the Inner, Outer and Expanding Circle.
These four skills are writing skills, listening skills, speaking skills and reading skills.
The proficiency was measured in complexity, the more complex the writing is, the more proficient the user.
There are three types of sampling, random sampling, representative sampling and convenience sampling.
The dependent, or response variables, are the influenced variables of the independent variable.
If we use more than one dependent or independent variable than the design is called a multivariate design.
def split_by_user_length(cond1, cond2, tol=1, df=s):
    '''Execution time: approx. 13s
    Splits a dataframe into two, so that they have approximately the same
    per-user and text_len distribution.
    cond1, cond2: functions row -> True/False
    tol: bin width for text_len matching (1 = exact length match)
    df: defaults into splitting sentences, can be changed at your risk
    Returns (df restricted to matched cond1 rows, df restricted to matched
    cond2 rows), with equal counts per (user, length-bin) pair.'''
    global k, w, s, f, u
    s1 = []; s2 = []
    df.loc[:, 'new_len'] = (df['text_len'] / tol).astype(int)
    for i in range(0, len(u)):
        u_df = df[df['u_id'] == i]
        if u_df.empty:
            # guard: min()/max() of an empty Series is NaN and breaks range()
            continue
        len_range = range(u_df['new_len'].min(), u_df['new_len'].max() + 1)
        # comprehension variable renamed from 'k' to avoid shadowing the
        # keystroke dataframe name
        d1 = {ln: [] for ln in len_range}; d2 = {ln: [] for ln in len_range}
        for idx, row in u_df.iterrows():
            if cond1(row):
                d1[row['new_len']].append(row['id'])
            if cond2(row):
                d2[row['new_len']].append(row['id'])
        for ln in len_range:
            # keep equally many ids from each side per length bin
            todo = min(len(d1[ln]), len(d2[ln]))
            s1 += d1[ln][0:todo]
            s2 += d2[ln][0:todo]
    tv1 = df['id'] < 0; tv2 = df['id'] < 0  # all-False boolean masks
    s1p = set(s1); s2p = set(s2)  # sets instead of {id: 1} dicts
    # .items() replaces Series.iteritems(), removed in pandas 2.0
    for idx, val in df['id'].items():
        if val in s1p:
            tv1[idx] = True
        if val in s2p:
            tv2[idx] = True
    print("Returning dataframes of len ", len(df[tv1]))
    return df[tv1], df[tv2]
def split_by_length(cond1, cond2, tol=1, df=s):
    '''Like split_by_user_length, but matches counts on text_len bins only,
    ignoring which user wrote the row.
    cond1, cond2: functions row -> True/False
    tol: bin width for text_len matching (1 = exact length match)'''
    global k, w, s, f, u
    s1 = []; s2 = []
    df.loc[:, 'new_len'] = (df['text_len'] / tol).astype(int)
    len_range = range(df['new_len'].min(), df['new_len'].max() + 1)
    d1 = {ln: [] for ln in len_range}; d2 = {ln: [] for ln in len_range}
    for idx, row in df.iterrows():
        if cond1(row):
            d1[row['new_len']].append(row['id'])
        if cond2(row):
            d2[row['new_len']].append(row['id'])
    for ln in len_range:
        # keep equally many ids from each side per length bin
        todo = min(len(d1[ln]), len(d2[ln]))
        s1 += d1[ln][0:todo]
        s2 += d2[ln][0:todo]
    tv1 = df['id'] < 0; tv2 = df['id'] < 0  # all-False boolean masks
    s1p = set(s1); s2p = set(s2)
    # .items() replaces Series.iteritems(), removed in pandas 2.0
    for idx, val in df['id'].items():
        if val in s1p:
            tv1[idx] = True
        if val in s2p:
            tv2[idx] = True
    print("Returning dataframes of len ", len(df[tv1]))
    return df[tv1], df[tv2]
def plot_distr(ys, cr1=None, cr2=None, split_cr=None, pct=0.2, \
               by_user=False, by_length=False, tol=1, df=s, p_only=False):
    '''Execution time: approx. 15s
    Splits a dataframe into two, according to criteria given by
    cr1, cr2: functions :: row -> [True, False]
    split_cr, pct=0.2 :: str -> if df[str] is abv/blw pct percentile
    by_user (dft False): normalize by user (criteria is user-sensitive)
    by_length (dft True): normalize by text_len
    Plots the distributions in variables given as ys
    sample usage: plot_distr(['t', 't/len'], cr1=isSingleCl, cr2=isMultiCl)
                  plot_distr('t', split_cr='k_count', by_user=True, by_length=True)
    Returns a list of p-values when p_only=True, otherwise an empty list.
    Raises ValueError when neither cr1+cr2 nor split_cr is supplied.
    '''
    global k, w, s, f, u, valid_u
    if (cr1 is not None) and (cr2 is not None):
        criteria1, criteria2 = cr1, cr2
        l1, l2 = cr1.__name__, cr2.__name__
    elif split_cr is not None:
        # derive criteria from the top/bottom pct quantiles of split_cr
        high_q = df[split_cr].quantile(1 - pct)
        low_q = df[split_cr].quantile(pct)
        def high_var(row):
            return (row[split_cr] >= high_q)
        def low_var(row):
            return (row[split_cr] <= low_q)
        criteria1, criteria2 = high_var, low_var
        l1, l2 = 'high '+split_cr, 'low '+split_cr
    else:
        # raising a plain string is a TypeError in Python 3; raise a real exception
        raise ValueError("plot_distr needs either cr1+cr2 or split_cr")
    if by_user:
        if not by_length:
            tol = 15000  # one huge bin: effectively disables length matching
        df_1, df_2 = split_by_user_length(criteria1, criteria2, tol=tol, df=df)
    elif by_length:
        df_1, df_2 = split_by_length(criteria1, criteria2, tol=tol, df=df)
    else:
        # no normalisation: just evaluate the criteria row by row
        tv1 = df['id'] < 0; tv2 = df['id'] < 0  # all False
        for idx, row in df.iterrows():
            if criteria1(row):
                tv1[idx] = True
            if criteria2(row):
                tv2[idx] = True
        df_1, df_2 = df[tv1], df[tv2]
    if not isinstance(ys, list):
        ys = [ys]
    res = []
    for v in ys:
        if p_only:
            res.append(get_p(df_1[v], df_2[v]))
        else:
            plot_distributions(df_1[v], df_2[v], label1=l1, label2=l2, x_label=v)
    return res
def get_p(data1, data2):
    """Normal-approximation p-value for the difference in means of two samples.

    Both samples are first truncated to the length of the shorter one (only
    one of the two branches can ever fire), then a z-score is formed from the
    absolute mean difference, the average of the two stds, and sqrt(n)."""
    n1, n2 = len(data1), len(data2)
    if n1 > n2:
        data1 = data1[:n2]
    if n2 > n1:
        data2 = data2[:n1]
    mean_gap = abs(data1.mean() - data2.mean())
    pooled_std = (data1.std() + data2.std()) / 2
    z = abs(mean_gap / pooled_std * np.sqrt(len(data1)))
    # sf(z) = 1 - cdf(z): upper-tail probability of the z-score
    return scipy.stats.norm.sf(z)
def plot_distributions(data1, data2, label1='data1', label2='data2', x_label='variable'):
    # Draws a box-plot + KDE comparison of two samples and prints a
    # normal-approximation p-value for the difference of their means
    # (same formula as get_p).
    # Truncate the longer sample so both have equal length; at most one of
    # the two branches can fire.
    if len(data1) > len(data2):
        data1 = data1[:len(data2)]
    if len(data2) > len(data1):
        data2 = data2[:len(data1)]
    # x-range clipped to the joint 1st..99th percentile span
    m = min(data1.quantile(0.01), data2.quantile(0.01))
    M = max(data1.quantile(0.99), data2.quantile(0.99))
    bins = np.linspace(m, M, 100)
    fig, axs = plt.subplots(2, gridspec_kw={'height_ratios': [1, 4]})
    fig.set_size_inches(12, 4)
    #plt.hold = True
    #boxes=[singleClause, multiClause]
    #axs[0].boxplot(boxes,vert=0)
    #axs[0].set(xlim=(m, M))
    axs[0].set(xlim=(m, M))
    my_pal = {0: "g", 1: "r"}
    sns.boxplot(data=[data1, data2], orient='h' , ax=axs[0], palette=my_pal, whis=[2.5, 97.5])
    #axs[1].hist(data1, bins, alpha=0.4, label=label1, color='g')
    #axs[1].hist(data2, bins, alpha=0.3, label=label2, color='r')
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 and removed
    # in 0.14; kdeplot/histplot are the replacements -- confirm before upgrading.
    sns.distplot(data1, hist = False, kde = True,
                 kde_kws = {'linewidth': 2, 'shade': True, 'color': 'g', 'alpha': 0.2},
                 label = label1)
    sns.distplot(data2, hist = False, kde = True,
                 kde_kws = {'linewidth': 2, 'shade': True, 'color': 'r', 'alpha': 0.2},
                 label = label2)
    # vertical lines mark the two sample means
    axs[1].axvline(x=data1.mean(), c='g', lw=1)
    axs[1].axvline(x=data2.mean(), c='r', lw=1)
    axs[1].set(xlim=(m, M))
    plt.legend(loc='upper right')
    plt.xlabel(x_label, fontsize=18)
    plt.show()
    # summary statistics + p-value printed below the figure
    mdiff = abs(data1.mean() - data2.mean())
    avgstd = (data1.std() + data2.std()) / 2
    pvalue = scipy.stats.norm.sf(abs(mdiff/avgstd * np.sqrt(len(data1))))
    print('Sample size: ', len(data1), \
          ' Mean diff: ', "{0:.4f}".format(mdiff), \
          ' stds: ', "{0:.4f}".format(mdiff/avgstd) ,end='')
    if pvalue < 0.00001:
        print(' p-value: ', '{:.5E}'.format(pvalue))
    else:
        print(' p-value: ', '{:.6f}'.format(pvalue))
def rand(row):
    """Random control criterion: True with probability 1/4 (row is ignored)."""
    return random.choice([True, False, False, False])

def rand2(row):
    """Deterministic pseudo-random criterion: rows whose id is 2 (mod 3)."""
    return row['id'] % 3 == 2

def rand3(row):
    """Deterministic pseudo-random criterion: rows whose id is 1 (mod 3)."""
    return row['id'] % 3 == 1
def hexplot(l1, l2, df=s, gridsize=20):
    """Hexbin plot of column l1 vs column l2, trimmed to the joint
    3rd..97th percentile window of both columns to hide outliers."""
    global k, w, s, f, u
    lo1, hi1 = df[l1].quantile(0.03), df[l1].quantile(0.97)
    lo2, hi2 = df[l2].quantile(0.03), df[l2].quantile(0.97)
    keep = (df[l1] > lo1) & (df[l1] < hi1) & (df[l2] > lo2) & (df[l2] < hi2)
    df[keep].plot.hexbin(x=l1, y=l2, gridsize=gridsize)
def plot_summary(data, bins=100, pct=0.01, logscale=None):
    '''Takes a pd.Series and draws a box plot over a histogram with the mean
    marked, after clipping to the [pct, 1-pct] quantile window.
    bins: kept for interface compatibility (distplot chooses its own bins).
    logscale: 1 forces a log y-axis, 0 forces linear; None auto-enables log
    when the plotted range spans more than two orders of magnitude.'''
    m, M = data.quantile(pct), data.quantile(1-pct)
    data = data[(data < M) & (data > m)]
    fig, axs = plt.subplots(2, gridspec_kw={'height_ratios': [1, 5]})
    fig.set_size_inches(6, 4)
    sns.boxplot(data, orient='h' , ax=axs[0],
                fliersize=0, whis=[2.5, 97.5])
    # NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
    # sns.histplot(data) is the modern equivalent -- confirm before upgrading.
    sns.distplot(data, kde=False)
    plt.axvline(x=data.mean(), c='b', lw=1.5)
    # 'is None' instead of '== None'
    if (logscale is None) and ((M / m) > 100):
        logscale = 1
    if logscale == 1:
        plt.yscale('log')
    # align the box plot's x-axis with the histogram's
    axs[0].set_xlim(axs[1].get_xlim())
def plot_fluency_latency(s1, s2):
    """Scatter two sentence sets in the (my_f, latency) plane:
    s1 in blue squares, s2 in red circles, axes fixed to [-2.5, 2.5]."""
    fig = plt.figure()
    axes = fig.add_subplot(111)
    fig.set_size_inches(8, 8)
    axes.set(xlim=(-2.5, 2.5), ylim=(-2.5, 2.5))
    axes.scatter(s1['my_f'], s1['latency'], s=10, c='b', marker="s",
                 label='first', alpha=0.3)
    axes.scatter(s2['my_f'], s2['latency'], s=10, c='r', marker="o",
                 label='second', alpha=0.3)
    plt.legend(loc='upper left')
    plt.show()
Motivation: we would like to capture the speed of typing with user-independent measures
Problem 1: time is non linear, if you simply sum or take means large pauses have disproportionate impact
Solution: use $\log t$ instead of $t$
Problem 2: different users have different keystroke mashing skills
Solution: standardize by user so that his $t$s have mean 0 std 1. So $adjusted\_log\_t = \frac{\log t - mean(\log t_{user})}{\sigma_{\log t_{user}}}$
We define $fluency$ to be the sum of those $adjusted\_log\_t$, divided by $\sqrt{len}$ (so it still has mean 0, std 1)
We define $latency$ to be the weighted sum of the longest pauses
Remember, high $fluency$ = high speed, high $latency$ = low speed
# Work on log(1 + t): compresses the heavy right tail of inter-key times.
k['log_t'] = np.log1p(k['t'])  # vectorised; replaces elementwise .apply(np.log1p)
k['log_t'].hist(bins=100)
# Per-user mean/std of log_t, vectorised with a single groupby instead of the
# original O(users * keystrokes) filter-per-user loop. Users with no
# keystrokes get NaN, exactly as the old per-user .mean() of an empty slice.
log_t_stats = k.groupby('u_id')['log_t'].agg(['mean', 'std'])
u['log_t_mean'] = log_t_stats['mean'].reindex(u.index)
u['log_t_std'] = log_t_stats['std'].reindex(u.index)
def findMean(u_id):
    """95th-percentile-capped mean keystroke time for user u_id (table u).
    Duplicates find_mean defined earlier in the file."""
    return u.at[int(u_id), 't_mean_95']

def findStd(u_id):
    """95th-percentile-capped std of keystroke times for user u_id."""
    return u.at[int(u_id), 't_std_95']

def findLogMean(u_id):
    """Per-user mean of log_t (see the groupby above)."""
    return u.at[int(u_id), 'log_t_mean']

def findLogStd(u_id):
    """Per-user std of log_t."""
    return u.at[int(u_id), 'log_t_std']

# Standardise raw and log inter-key times by each user's own statistics.
mean_t = k['u_id'].apply(findMean)
std_t = k['u_id'].apply(findStd)
k['adj_t'] = (k['t'] - mean_t) / std_t
mean_log = k['u_id'].apply(findLogMean)
std_log = k['u_id'].apply(findLogStd)
k['adj_log_t'] = (k['log_t'] - mean_log) / std_log
# Runs in approx. 50s
# Rebuild the s_id -> (start, end) keystroke row ranges (same as above;
# needed again because k now carries the adj_* columns).
k_sorted = k.sort_values(by='s_id')
rs = 0; re = 0; ranges = {}
# .items() replaces Series.iteritems(), which was removed in pandas 2.0
for idx, val in k_sorted['s_id'].value_counts().sort_index().items():
    re += val
    ranges[idx] = (rs, re)
    rs += val
# Per-sentence sums of user-adjusted timings, normalised by length or sqrt(length).
for s_id, row in s.iterrows():
    rng = ranges.get(s_id)  # single lookup; 'is not None' instead of '!= None'
    if rng is not None:
        t, v = rng
        k_sentence = k_sorted.iloc[t:v, :]
    else:
        k_sentence = k_sorted.iloc[0:0, :]
    ###### YOUR CODE HERE #####
    ## Will run ranging over sentence with s_id, row, k_sentence ##
    ## Example: s.at[s_id, 'ks_len'] = len(k_sentence) / len(row['text'])
    s.at[s_id, 'adj_t/len'] = k_sentence['adj_t'].sum() / row['text_len']
    s.at[s_id, 'adj_t/sqrt_len'] = k_sentence['adj_t'].sum() / np.sqrt(row['text_len'])
    s.at[s_id, 'adj_log_t/len'] = k_sentence['adj_log_t'].sum() / row['text_len']
    s.at[s_id, 'adj_log_t/sqrt_len'] = k_sentence['adj_log_t'].sum() / np.sqrt(row['text_len'])
    # NOTE(review): the column name suggests division by sqrt(keystroke count),
    # but the code divides by sqrt(text_len), duplicating the line above --
    # confirm intent before changing.
    s.at[s_id, 'adj_log_t/sqrt_k'] = k_sentence['adj_log_t'].sum() / np.sqrt(row['text_len'])
# fluency = negated z-score of adj_log_t/sqrt_len (higher fluency = faster typing)
mn = s['adj_log_t/sqrt_len'].mean()
std = s['adj_log_t/sqrt_len'].std()
s['fluency'] = (-1) * (s['adj_log_t/sqrt_len'] - mn) / std
# Geometric decay factor for the latency weights below.
decay_par = 0.5
# Question: would it make sense and how would you adjust by user?
# Runs in approx. 65s
# Rebuild the s_id -> (start, end) keystroke row ranges once more.
k_sorted = k.sort_values(by='s_id')
rs = 0; re = 0; ranges = {}
# .items() replaces Series.iteritems(), which was removed in pandas 2.0
for idx, val in k_sorted['s_id'].value_counts().sort_index().items():
    re += val
    ranges[idx] = (rs, re)
    rs += val
# Runs in 5 min
# latency ingredient: weighted sum of each sentence's log-pauses, sorted in
# descending order and weighted decay_par**1, decay_par**2, ... so the
# longest pauses dominate.
for s_id, row in s.iterrows():
    rng = ranges.get(s_id)
    if rng is not None:
        t, v = rng
        k_sentence = k_sorted.iloc[t:v, :]
    else:
        k_sentence = k_sorted.iloc[0:0, :]
    if len(k_sentence) == 0:
        # guard: the original indexed weights[-1] on an empty Series here,
        # raising KeyError; skip and leave weighted_log_t as NaN instead
        continue
    weights = pd.Series(range(1, len(k_sentence) + 1)).rpow(decay_par)
    # doubles the final (smallest) weight -- NOTE(review): possibly intended
    # for the first (largest-pause) weight; confirm before changing
    weights[len(k_sentence) - 1] += weights[len(k_sentence) - 1]
    sorted_k = k_sentence['log_t'].sort_values(ascending=False)
    s.at[s_id, 'weighted_log_t'] = sorted_k.reset_index()['log_t'] @ weights
# latency = z-score of the weighted pause sum
mean, std = s['weighted_log_t'].mean(), s['weighted_log_t'].std()
s['latency'] = (s['weighted_log_t'] - mean) / std
# Bag-of-words vocabulary fitted on the reconstructed full file texts; used
# below to compare sentences by cosine distance.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
count.fit(f['text'])
# Hand-picked target sentences (immediately overwritten by the line below).
goal_s = ['Learner corpora are electronic collections of natural written data of L2 learners, also called L2-Corpora.',
          'Address advantages and disadvantages associated with each method.',
          'It measures how compatible your data is with the null hypothesis.',
          'The scoring is based on the use of numerical scales. ',
          'What is meant by more complex and fluent speech?',
          'The measure of central tendency which is most sensitive to data from outliers is the mean.']
goal_s = list(s.iloc[50:70]['text'])  # NOTE(review): replaces the list above -- confirm which set is wanted
# RUNS IN 2-5 MINS
# For each goal sentence, collect the ids of the 10 closest sentences by
# bag-of-words cosine distance, keeping only candidates with distance < 0.6.
goal_bag = [count.transform(np.array([g_s])) for g_s in goal_s]
goal_dense = [gb.todense() for gb in goal_bag]  # hoisted: was densified per row
goal_lists = [[] for g_s in goal_s]
for s_id, row in s.iterrows():
    bag_dense = count.transform(np.array([row['text']])).todense()
    for idx in range(0, len(goal_s)):
        # compute the distance once (the original recomputed it on every hit)
        d = scipy.spatial.distance.cosine(bag_dense, goal_dense[idx])
        if d < 0.6:
            goal_lists[idx].append((d, s_id))
for idx in range(0, len(goal_s)):
    goal_lists[idx] = [x[1] for x in sorted(goal_lists[idx])[:10]]
# Scatter of mean fluency (x) vs mean latency (y) for each goal sentence;
# marker size is proportional to the number of matched sentences.
N = len(goal_s)
labels = goal_s  # NOTE: alias, not a copy -- the truncation below mutates goal_s
data = np.random.random((N, 4))  # column 2 is never overwritten: colors stay random
for i in range(0, len(goal_s)):
    data[i, 3] = len(goal_lists[i])
    data[i, 0] = s.iloc[goal_lists[i]]['my_f'].mean()
    data[i, 1] = s.iloc[goal_lists[i]]['latency'].mean()
    if len(labels[i]) > 20:
        labels[i] = labels[i][:17] + '...'
fig, ax = plt.subplots()
fig.set_size_inches(8, 8)
ax.set(xlim=(-1, 1), ylim=(-1, 1))
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
plt.subplots_adjust(bottom = 0.1)
plt.scatter(
    data[:, 0], data[:, 1], marker='o', c=data[:, 2], s=data[:, 3] * 10,
    cmap=plt.get_cmap('Spectral'))
for label, x, y in zip(labels, data[:, 0], data[:, 1]):
    plt.annotate(
        label,
        xy=(x, y), xytext=(-20, 20),
        textcoords='offset points', ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'),
        size='xx-small',
        )
plt.show()
# same thing
# Repeat of the scatter above with 10x larger markers and default label size.
# Because labels aliased goal_s in the previous cell, the truncation here is
# a no-op on already-shortened strings.
N = len(goal_s)
labels = goal_s
data = np.random.random((N, 4))  # column 2 again left random (marker colors)
for i in range(0, len(goal_s)):
    data[i, 3] = len(goal_lists[i])
    data[i, 0] = s.iloc[goal_lists[i]]['my_f'].mean()
    data[i, 1] = s.iloc[goal_lists[i]]['latency'].mean()
    if len(labels[i]) > 20:
        labels[i] = labels[i][:17] + '...'
fig, ax = plt.subplots()
fig.set_size_inches(8, 8)
ax.set(xlim=(-1, 1), ylim=(-1, 1))
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
plt.subplots_adjust(bottom = 0.1)
plt.scatter(
    data[:, 0], data[:, 1], marker='o', c=data[:, 2], s=data[:, 3] * 100,
    cmap=plt.get_cmap('Spectral'))
for label, x, y in zip(labels, data[:, 0], data[:, 1]):
    plt.annotate(
        label,
        xy=(x, y), xytext=(-20, 20),
        textcoords='offset points', ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))
plt.show()
def plot_fluency_latency(s1, s2):
    """Scatter two sentence sets in the (my_f, latency) plane.
    Duplicate of the identical definition earlier in the file; kept so that
    re-running this cell re-binds the same function."""
    figure = plt.figure()
    panel = figure.add_subplot(111)
    figure.set_size_inches(8, 8)
    panel.set(xlim=(-2.5, 2.5), ylim=(-2.5, 2.5))
    panel.scatter(s1['my_f'], s1['latency'], s=10, c='b', marker="s",
                  label='first', alpha=0.3)
    panel.scatter(s2['my_f'], s2['latency'], s=10, c='r', marker="o",
                  label='second', alpha=0.3)
    plt.legend(loc='upper left')
    plt.show()
# Dataset size and share of deletion keystrokes (op == '-').
print("Our data consists of ", len(k), " keystrokes")
print('Percentage of deletions: ', "{:.3f}".format(len(k[k['op'] == '-']) / len(k) * 100), "%")
Most common char is ' ', most common punctuation is '.', then ','. Most common uppercase is 'I'
# Table of the 30 most-typed characters: share of all keystrokes, mean
# inter-key time (capped at 5s), and how often the character was deleted.
print('Most typed characters:\n')
print('char freq speed deleted')
pcts = k['char'].value_counts() / len(k) * 100
# .items() replaces Series.iteritems(), removed in pandas 2.0
for idx, val in pcts.head(30).items():
    # clip() replaces the elementwise .apply(lambda x: min(x, 5000))
    ks = k[(k['char'] == idx)]['t'].clip(upper=5000)
    kdel = k[(k['char'] == idx)]['is_del']
    print("{:>2}".format(idx), " ", "{:>6.2f}".format(val), "% ", "{:>6.1f}".format(ks.mean()),
          " ", "{:>6.3f}".format(100 * kdel.sum() / len(kdel)), "%")
On a log scale:
# Raw inter-key time summary. The bare .mean()/.median() expressions only
# display in a notebook; as a script they are no-ops.
print('log_t')
k['t'].mean()
k['t'].median()
plot_summary(k['t'])
In a small segment, it behaves like $c^x$
# Inter-key times restricted to (550ms, 5s) -- roughly exponential in this band
plot_summary(k[(k['t'] > 550) & (k['t'] < 5000)]['t'])
Here is why we chose to take log, distribution looks nicer
# Near-untrimmed linear-scale view of raw t, motivating the log transform
plot_summary(k['t'], pct=0.001, logscale=0)
The imperfection/spikes are due to the fact that etherpad only logs keystrokes every 0.5 second
So there's a bunch that happened at a distance of .5, .25 etc
# Distribution of log(1 + t)
plot_summary(k['log_t'])
Impact of adjusting by user
# Distribution of per-user standardised log_t
plot_summary(k['adj_log_t'])
For every sentence, we take the number of pauses longer than .5, 1, 2, 5, 10 seconds
# Count of pauses longer than 0.5s per sentence
plot_summary(s['p_510_no'], pct=0.0003, logscale=0)
On average, people take 5 breaks longer than 2 seconds per sentence
# Count of pauses longer than 2s per sentence
plot_summary(s['p_2000_no'], pct=0.0003, logscale=0)
The number of long breaks decreases approximately exponentially
# Count of pauses longer than 10s per sentence (log scale)
plot_summary(s['p_10000_no'], pct=0, logscale=1)
len(f)  # bare expression: displays the number of files in a notebook only
print("Our data consists of ", len(w), " words")
plot_summary(w['text_len'])
print("{:.3f}".format(100 * len(w[w['spell_err'] > 0]) / len(w)), "% of the words were misspelt")
# Table of the 30 most frequent words: revision rate, mean time-per-character,
# and a trimmed (60th-70th percentile) mean of the longest pause per word.
print('{:>12}'.format("word"), '{:>8}'.format("count"), '{:>11}'.format("revised"), '{:>10}'.format(" time/len"), '{:>10}'.format(" longest p (mean of IQR)"), "\n")
# .items() replaces Series.iteritems(), removed in pandas 2.0
for idx, val in w['text'].value_counts().head(30).items():
    wds = w[w['text'] == idx]
    pm, pM = wds['max_p'].quantile(0.6), wds['max_p'].quantile(0.7)
    # note: the '" " \'{:>5.1f}\'' below is implicit string-literal concatenation
    print('{:>12}'.format(idx), '{:>8}'.format(val), " "
          '{:>5.1f}'.format(100 * len(wds[wds['n_revisions'] > 0]) / val), "% ",
          '{:>8.1f}'.format(wds['t/len_filter_10000'].mean()), " ",
          '{:>8.1f}'.format(wds[(wds['max_p'] > pm) & (wds['max_p'] < pM)]['max_p'].mean()))
# Number of sentences in the dataset
print("Our data consists of ", len(s), " sentences")
On average, sentences are about 120 characters / 18 words long
# Sentence length in characters and in words
plot_summary(s['text_len'])
plot_summary(s['word_count'])
And time spent is pretty much proportional to the length of the sentence
# Hexbin: sentence length vs capped writing time (outliers trimmed by literal bounds)
s[(s['text_len'] < 200) & (s['t_filter_10000'] < 130000)].plot.hexbin(x='text_len', y='t_filter_10000', gridsize=20)
We imported these metrics from CoCoGen, removing some who were duplicates/uninformative
# Full list of CoCoGen metrics imported for the sentences table
print (['Syntactic.ClausesPerSentence_value',
        'Syntactic.DependentClausesPerSentence_value',
        'Syntactic.CoordinatePhrasesPerSentence_value',
        'Syntactic.VerbPhrasesPerSentence_value',
        'Syntactic.ComplexNominalsPerSentence_value',
        'NounPhrasePreModificationWords_value',
        'NounPhrasePostModificationWords_value',
        'Lexical.Sophistication.NAWL_value',
        'Lexical.Sophistication.NGSL_value', 'Lexical.Sophistication.AFL_value',
        'Lexical.Sophistication.ANC_value', 'Lexical.Sophistication.BNC_value',
        'Lexical.Density_value', 'Lexical.Diversity.NDW_value',
        'Lexical.Diversity.CNDW_value', 'Lexical.Diversity.TTR_value',
        'Lexical.Diversity.CTTR_value', 'Lexical.Diversity.RTTR_value',
        'Morphological.MeanSyllablesPerWord_value',
        'Morphological.MeanLengthWord_value', 'KolmogorovDeflate_value',
        'Morphological.KolmogorovDeflate_value',
        'Syntactic.KolmogorovDeflate_value'])
And normalized those who were linearly dependent on text length
So that they now represent stds from the mean of that metrics within the pool of sentences of that length
For example, here is Kolmogorov deflate, which pretty much was linear in text_len
# Raw KolmogorovDeflate vs length: shows the linear dependence on text_len
s[(s['text_len'] < 150) & (s['t_filter_10000'] < 130000)].plot.hexbin(x='text_len', y='KolmogorovDeflate_value', gridsize=20)
And now does not depend on it
# Normalised n_Kolmogorov vs length: dependence removed by the per-bin z-scoring
s[(s['text_len'] < 150) & (s['t_filter_10000'] < 130000)].plot.hexbin(x='text_len', y='n_Kolmogorov', gridsize=20)
These are the measures affected:
# The (raw column, normalised column) pairs affected by the length-normalisation
print([('KolmogorovDeflate_value', 'n_Kolmogorov'),
       ('Syntactic.KolmogorovDeflate_value', 'n_SKolmogorow'),
       ('Morphological.KolmogorovDeflate_value', 'n_MKolmogorow'),
       ('Lexical.Diversity.NDW_value', 'n_NDW_value'),
       ('Lexical.Diversity.RTTR_value', 'n_RTTR_value')])
We computed time spent and time/len for different thresholds (1s, 2s, 5s, 10s, 30s), capping pauses longer than the threshold at the threshold value
This was necessary because otherwise outliers (such as minute-long, or even hour-long, breaks) corrupted the statistical metrics
# Time-spent distributions with pauses capped at 10s
plot_summary(s['t_filter_10000'])
plot_summary(s['t/len_filter_10000'])
plot_summary(s['t/word_filter_10000'])
Please refer to section 3 for mathematical formulations
Fluency is an indicator of average typing speed. It doesn't excessively weight very long pauses, and does not depend on length. Comparing two sentences written in the same time, the one written at a constant pace is more fluent than the one written in bursts and long breaks
# Distribution of the fluency z-score
plot_summary(s['fluency'])
Latency is an indicator of the time taken in the longest pauses during the sentence. A sentence written all at once after one big pause would have high latency, one written in bursts with medium-length breaks would have medium latency, and one at a constant pace will have low latency.
Longer sentences tend to have slightly higher latency because they tend to contain longer pauses overall
# Distribution of the latency z-score
plot_summary(s['latency'])
Fluency and latency tend to be negatively correlated, because they both depend on the overall time spent: more time means lower fluency and higher latency
# Pearson correlation between fluency and latency (bare name displays it in a notebook)
cff = s['fluency'].corr(s['latency'])
cff
If you do a x-y plot of fluency vs latency you can identify different sentences types
# Decorrelate fluency from latency: fit fluency ~ latency linearly and
# subtract the fitted slope component. For np.poly1d, fit[1] is the
# coefficient of x**1, i.e. the slope of the degree-1 fit.
fit = np.poly1d(np.polyfit(s['latency'], s['fluency'], 1))
s['my_f'] = s['fluency'] - fit[1] * s['latency']
s.head(400).plot.scatter(x='my_f', y='latency')
We divided into time spent on word, and on separators
It is not clear yet whether a bigger fraction of time spent on separators indicates more syntactically complex sentences, or simply more difficult words
Around 28% of the time is spent on separators
# Share of writing time spent on separators vs on words
plot_summary(s['separator_t/t_filter_10000'])
plot_summary(s['word_t/t_filter_10000'])
Here is the avg time spent revising the sentence, which should be very indicative of complexity
# Revision-time and deletion-time shares per sentence
plot_summary(s['revision_t/len_filter_10000'], logscale=0)
plot_summary(s['del_t_filter_10000/t'], logscale=0)
A jump is defined to be whenever the user moves to a non-consecutive point of the final sentence
A new chunk is defined to be whenever the user moves to a different sentence, and then comes back to the one he's now writing
Both are related with the amount of revisions
# Counts of cursor jumps and of writing chunks per sentence
plot_summary(s['jumps'], pct=0, logscale=0)
plot_summary(s['chunks'], pct=0, logscale=0)
For a given sentence length and for a fixed user, single-clause sentences take longer to write ($p = 2 \cdot 10^{-9}$)
This is the opposite of what one might expect. But actually, it's easier to form a 'simple' sentence of a given length by splitting it up over multiple clauses, rather than writing one extremely long clause.
In particular, single-clause sentences have both higher latency and higher fluency, indicating they are more likely to be written all at once rather than in blocks at a constant pace with breaks
If we remove the dependency on sentence length, multi-clause sentences, have higher latency, indicating a larger block of time devoted to thinking about sentence structure (but still higher fluency)
As far as secondary measures go, multi-claused sentences have a higher proportion of time spent revising and on separators.
One might think that during the revision process, single-claused sentences are "improved" into multi-claused sentences. But in fact, the opposite happens.
The difference in number of jumps and chunks very strongly suggest that by revising after the sentence is completed for the first time, writers are turning multi claused sentences into single claused. This is because, again, single claused sentences tend to be more elaborate.
It is unclear if writers happen to go through the same process in the middle of writing a sentence as well.
(Note: removing the filter for equal-length sentences make multi-clause sentences appear to be more complex in all aspects, but this is likely related to the fact that multi-clause sentences are ~51 chars longer on avg)
Comparing coordinate vs dependent clauses, there is no appreciable difference in fluency, latency, or typing speed.
However, subordinate sentences exhibit a much bigger number of >2s pauses. This likely means that it's easier to write a coordinative 'and' without stopping to think, whereas it is more difficult to introduce a dependent clause.
Subordinate clauses also tend to have bigger revision, separator, and jump counts (but not chunks), likely signifying increased linguistic complexity.
def isSingleClause(row):
    """True when the sentence has at most one clause (CoCoGen clause count)."""
    return row['Syntactic.ClausesPerSentence_value'] <= 1

def isMultiClause(row):
    """True when the sentence has more than one clause."""
    return row['Syntactic.ClausesPerSentence_value'] > 1
# Single- vs multi-clause comparisons, with and without length matching
plot_distr(['fluency', 'latency', 't/len_filter_10000', 'p_2000_no'],
           by_user=True, by_length=True, cr1=isSingleClause, cr2=isMultiClause)
plot_distr(['fluency', 'latency', 't/len_filter_10000', 'p_2000_no'],
           by_user=True, by_length=False, cr1=isSingleClause, cr2=isMultiClause)
plot_distr(['revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, cr1=isSingleClause, cr2=isMultiClause)
plot_distr(['text_len','revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=False, cr1=isSingleClause, cr2=isMultiClause)
def isOnlyCoord(row):
return (row['Syntactic.CoordinatePhrasesPerSentence_value'] > 0) and \
(row['Syntactic.DependentClausesPerSentence_value'] == 0)
def isOnlySub(row):
    """Predicate: the sentence has dependent clauses but no coordinate phrases."""
    # Guard clause preserves the original short-circuit evaluation order.
    if not (row['Syntactic.DependentClausesPerSentence_value'] > 0):
        return False
    return row['Syntactic.CoordinatePhrasesPerSentence_value'] == 0
# Coordinate-only vs subordinate-only sentences across all keystroke metrics.
plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, cr1=isOnlyCoord, cr2=isOnlySub)
Sentences with low/high ANC Sophistication show no appreciable differences in fluency or latency or typing speed.
However, sentences with more sophisticated lexicon show a much greater number of pauses >2s (implying one often needs to stop and think before an unusual word), and a lower revision % (implying people do not often go back and replace unusual words once they put the effort to write them in the first place)
The higher number of jumps and chunks suggests that people often do go back to replace easier words with sophisticated ones, however.
When evaluated on the BNC discrepancies are lower, which suggests that the population sample is more familiar with uncommon words from the BNC rather than the ANC
Other sophistication measures were not correlated with anything except:
high (Words not on General Service list) -> low fluency (p=0.0008)
PreModificationWords | PostModificationWords | Lexical.Density | NDW | RTTR
_____________________|_______________________|_________________|______________|_______________
| | | |
low fluency | LOW fluency | | high fluen. | high fluen.
high-ish latency | | high-ish laten. | low latency |
high revision_t | | low revision_t | high rev_t | high rev_t
| HIGH separator_t | low separator_t | high sep_t | high sep_t
| HIGH jumps/chunks | high jumps/chks | low jumps/c |
| | | |
# Split sentences by each lexical-sophistication metric (ANC/BNC/NAWL/NGSL
# word lists) and compare keystroke behaviour across the split.
plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, split_cr='Lexical.Sophistication.ANC_value')
plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, split_cr='Lexical.Sophistication.BNC_value')
plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, split_cr='Lexical.Sophistication.NAWL_value')
plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, split_cr='Lexical.Sophistication.NGSL_value')
PreModificationWords | PostModificationWords | Lexical.Density | NDW | RTTR
_____________________|_______________________|_________________|______________|_______________
| | | |
low fluency | LOW fluency | | high fluen. | high fluen.
high-ish latency | | high-ish laten. | low latency |
LOW revision_t | | low revision_t | high rev_t | high rev_t
| HIGH separator_t | LOW separator_t | HIGH sep_t | HIGH sep_t
| HIGH jumps/chunks | HIGH jumps/chks | low jumps/c |
| | | |
low/high-ish = 0.01 < p < 0.05
low/high = 0.0001 < p < 0.01
LOW/HIGH = p < 0.0001
metrics CNDW, TTR, CTTR were not statistically significant in interesting ways
# Metrics that showed significant effects in the table above vs those that
# did not; plot each as a split criterion against all keystroke metrics.
relevant_metrics = [
    'NounPhrasePreModificationWords_value',
    'NounPhrasePostModificationWords_value',
    'Lexical.Density_value',
    'n_NDW_value',
    'n_RTTR_value']
other_metrics = [
    'Lexical.Diversity.CNDW_value',
    'Lexical.Diversity.TTR_value',
    'Lexical.Diversity.CTTR_value']
for m in relevant_metrics:
    plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
               by_user=True, by_length=True, split_cr=m)
for m in other_metrics:
    plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
               by_user=True, by_length=True, split_cr=m)
memo: High Kolmogorov deflate = SIMPLE (because more easily compressed)
Kolmogorov deflate is an excellent predictor of just about any keystroke and linguistic complexity measure. Namely, High K. Deflate correlates ($p < 10^{-10}$) with:
Who's better at predicting what? (everything statistically significant to several orders of magnitude)
note: they are both generally very good predictors (they are very correlated, so...), but sometimes one is much better than the other
Syntactic K. | Morphological K.
__________________|_____________________
|
fluency | latency
time, t/len, no_p | jumps
del_count | separator/word_t
revision_t/t |
__________________|_____________________
|
| dependentClauses
| pre/post mod words
| Sophistication
| Diversity
|
# Split on the (corrected) Kolmogorov-deflate complexity and on its
# syntactic/morphological variants. my_s_col is defined elsewhere in the
# notebook — presumably the full sentence-metric column list; confirm.
plot_distr(['fluency', 'latency', 't/len_filter_10000'],
           by_user=True, by_length=True, split_cr='n_Kolmogorov')
plot_distr(['del_count', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, split_cr='n_Kolmogorov')
plot_distr(['word_count', 'Lexical.Density_value', 'Syntactic.ClausesPerSentence_value', 'Lexical.Sophistication.ANC_value'],
           by_user=True, by_length=True, split_cr='n_Kolmogorov')
plot_distr(my_s_col,
           by_user=True, by_length=True, split_cr='n_SKolmogorov')
plot_distr(my_s_col,
           by_user=True, by_length=True, split_cr='n_MKolmogorov')
As expected,
high fluency = FAST typing = simple language =>
Latency is kind of the same, except that for e.g. Sophistication metrics fluency is a much better predictor, and latency does not have impact on morphological complexity
# Typing speed, deletions and revision time split by fluency (per sentence,
# not per user).
plot_distr(['word_t/t_filter_10000', 'del_count', 'revision_t/t_filter_10000'],
           by_user=False, by_length=True, split_cr='fluency')
def highJ(row):
    """Predicate: the sentence contains at least one jump."""
    jump_count = row['jumps']
    return jump_count > 0
def lowJ(row):
    """Predicate: the sentence contains no jumps."""
    jump_count = row['jumps']
    return jump_count <= 0
# Sentences with vs without jumps, across all keystroke metrics.
plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, cr1=highJ, cr2=lowJ)
def highC(row):
    """Predicate: the sentence was written in more than one chunk."""
    chunk_count = row['chunks']
    return chunk_count > 1
def lowC(row):
    """Predicate: the sentence was written in at most one chunk."""
    chunk_count = row['chunks']
    return chunk_count <= 1
# Multi-chunk vs single-chunk sentences, across all keystroke metrics.
plot_distr(['fluency', 'latency', 'p_2000_no', 'revision_t/t_filter_10000', 'separator_t/t_filter_10000', 'jumps', 'chunks'],
           by_user=True, by_length=True, cr1=highC, cr2=lowC)
# Persist all derived columns back to the canonical CSVs (no suffix — these
# are the main files, not the sampled variants loaded at the top).
k.to_csv('../../pads/keystrokes.csv', index=False)
w.to_csv('../../pads/words.csv', index=False)
s.to_csv('../../pads/sentences.csv', index=False)
f.to_csv('../../pads/files.csv', index=False)
u.to_csv('../../pads/users.csv', index=False)
# Reload sentences so `s` reflects exactly what was written to disk.
s = pd.read_csv('../../pads/sentences.csv', index_col=False)
# Export each file's text to the frontend, one .txt per file. The filename
# encodes the pad path (with '/' replaced by '-'), the file index, and the
# start/end sentence ids.
for idx, row in f.iterrows():
    out_path = ("../../frontend/files/" + row['path'].replace('/', "-") + "!"
                + str(idx) + "!" + str(row['s_id_s']) + "!" + str(row['s_id_e']) + ".txt")
    # `with` guarantees the handle is closed even if the write raises.
    with open(out_path, "w") as out_file:
        out_file.write(row['text'])
# Re-assemble whole-pad texts by concatenating each pad's sentences; record
# the min/max sentence id in the filename so the frontend can map text back
# to sentence ids.
text = s.groupby('path')['text'].apply(lambda x: ' '.join(x))
st_id = s.groupby('path')['id'].min()
end_id = s.groupby('path')['id'].max()
for i, (t, sid, eid) in enumerate(zip(text, st_id, end_id)):
    # `with` guarantees the handle is closed even if the write raises.
    with open("../../frontend/files/" + str(i) + "!" + str(sid) + "!" + str(eid) + ".txt", "w") as out_file:
        out_file.write(t)
# I/O to file for cocogen
# Dump the text of one specific pad into a scratch file. Append mode:
# re-running this cell accumulates duplicates.
s = pd.read_csv('../../pads/old_sentences' + suffix + '.csv', index_col=False)
len(s[s['path'] == 'SS18/e69d4/10'])
# Open once outside the loop (the original re-opened per row and relied on
# explicit close); `with` also guarantees the handle is closed on error.
with open("../../frontend/prova.txt", "a") as out_file:
    for idx, row in s[s['path'] == 'SS18/e69d4/10'].iterrows():
        out_file.write(row['text'])
        out_file.write("\n\n")
# Append the text of the last 1000 sentences into one scratch file.
# Open once outside the loop; `with` guarantees closure on error.
with open("../../frontend/tette2.txt", "a") as out_file:
    for idx, row in s.tail(1000).iterrows():
        out_file.write(row['text'])
# One .txt per sentence, named by its DataFrame index.
for idx, row in s.iterrows():
    # `with` guarantees the handle is closed even if the write raises.
    with open("../../frontend/sentences/" + str(idx) + ".txt", "w") as out_file:
        out_file.write(row['text'])
f.head(50)
# Append the text of the first 1442 sentences into one scratch file.
# Open once outside the loop; `with` guarantees closure on error.
with open("../../frontend/tette_new_s.txt", "a") as out_file:
    for idx, row in s.head(1442).iterrows():
        out_file.write(row['text'])
# Append the text of the first 50 files into one scratch file.
# Open once outside the loop; `with` guarantees closure on error.
with open("../../frontend/tette_new.txt", "a") as out_file:
    for idx, row in f.head(50).iterrows():
        out_file.write(row['text'])
# NOTE(review): duplicate of the file-export loop earlier in the notebook.
for idx, row in f.iterrows():
    out_path = ("../../frontend/files/" + row['path'].replace('/', "-") + "!"
                + str(idx) + "!" + str(row['s_id_s']) + "!" + str(row['s_id_e']) + ".txt")
    # `with` guarantees the handle is closed even if the write raises.
    with open(out_path, "w") as out_file:
        out_file.write(row['text'])
# NOTE(review): stray expression — raises NameError unless a global `a`
# exists; presumably left here on purpose to halt a notebook "Run All" at
# this point. Confirm intent or remove.
a
# Aggregate sentence-level metrics per user (means over the user's sentences).
# NOTE(review): divides by len(u_s) — raises ZeroDivisionError for a user
# with no sentences; presumably every user here has at least one. Confirm.
for u_id, row in u.iterrows():
    u_s = s[s['u_id'] == u_id]
    u.at[u_id, 'jumps/s'] = u_s['jumps'].sum() / len(u_s)      # jumps per sentence
    u.at[u_id, 'chunks/s'] = u_s['chunks'].sum() / len(u_s)    # chunks per sentence
    u.at[u_id, 'latency/s'] = u_s['latency'].mean()
    u.at[u_id, 'fluency/s'] = u_s['fluency'].mean()
    u.at[u_id, 'n_SKolmogorov'] = u_s['n_SKolmogorov'].mean()
    u.at[u_id, 'n_MKolmogorov'] = u_s['n_MKolmogorov'].mean()
    u.at[u_id, 'M.MeanSyllablesPerWord'] = u_s['Morphological.MeanSyllablesPerWord_value'].mean()
    u.at[u_id, 'S.DependentC'] = u_s['Syntactic.DependentClausesPerSentence_value'].mean()
    u.at[u_id, 'L.Density'] = u_s['Lexical.Density_value'].mean()
    u.at[u_id, 'n_NDW_value'] = u_s['n_NDW_value'].mean()
# Keep only users with enough data (>10k keystrokes) for stable statistics.
valid_u = u[u['k_count'] > 10000]
# k_ms: keystroke-behaviour metrics used for clustering below;
# l_ms: linguistic metrics used to characterise the resulting clusters.
k_ms = ['log_t_mean', 'p_510_no/len', 'p_2000_no/len', 'p_10000_no/len', 'latency/s', 'fluency/s',
        'revision_t/len_filter_10000_mean', 'del/len_mean',
        'jumps/s', 'chunks/s', 'separator_t/t_filter_10000_mean']
l_ms = ['text_len_mean', 'n_SKolmogorov', 'n_MKolmogorov', 'M.MeanSyllablesPerWord', 'S.DependentC', 'L.Density', 'n_NDW_value']
#u_mean = [u[col].mean() for col in u_ms]
#u_std = [u[col].mean() for col in u_ms]
#
#adj_ms = ['adj_' + ms for ms in u_ms]
#
#for ms, mean, std in zip(u_ms, u_mean, u_std):
# u['adj_' + ms] = (u[ms] - mean) / std
from mpl_toolkits.mplot3d import Axes3D
def plot_3d(x_col, y_col, z_col, color_col=None, df=valid_u):
    """3-D scatter of three user-level metrics.

    x_col/y_col/z_col: column names in df plotted on the three axes.
    color_col: optional column used to colour points; the colour range is
        clipped to that column's 5th–95th percentile to tame outliers.
    df: DataFrame to plot (defaults to valid_u, captured at definition time).
    """
    global k, w, s, f, u, valid_u
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')
    # Removed a stray `plt.figure(figsize=(2, 2))` that created an empty
    # second figure and redirected subsequent pyplot state to it.
    cm = plt.cm.get_cmap('RdYlBu')
    if color_col is None:
        sc = ax.scatter(df[x_col], df[y_col], df[z_col],
                        s=15, cmap=cm)
    else:
        sc = ax.scatter(df[x_col], df[y_col], df[z_col],
                        c=df[color_col], vmin=df[color_col].quantile(0.05), vmax=df[color_col].quantile(0.95),
                        s=15, cmap=cm)
        # Colorbar only when the points actually carry colour values; with no
        # `c` array the mappable has nothing for a colorbar to show.
        fig.colorbar(sc)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_zlabel(z_col)
    plt.show()
# Typing speed vs jumps vs separator time, coloured by mean text length.
plot_3d('log_t_mean', 'jumps/s', 'separator_t/t_filter_10000_mean', color_col='text_len_mean')
# Cluster users on the keystroke metrics only, standardised to zero mean /
# unit variance.
X = valid_u[k_ms]
X.head(2)
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)
from sklearn.cluster import KMeans
# Elbow plot: negated KMeans score for k = 1..19.
sc = []
for n_c in range(1, 20):
    k_means = KMeans(n_clusters=n_c)
    k_means.fit(X)
    sc.append((-1)* k_means.score(X))
# NOTE(review): the x-axis is the 0-based list index, i.e. n_clusters - 1.
plt.plot(sc)
# Final clustering with k chosen from the elbow plot; fixed seed for
# reproducibility.
k_means = KMeans(n_clusters=4, random_state=13)
k_means.fit(X)
valid_u.loc[:, 'kmeans_label'] = k_means.labels_
# Copy labels back onto the full user table (users absent from valid_u keep NaN).
for idx, row in valid_u.iterrows():
    u.at[idx, 'kmeans_label'] = row['kmeans_label']
plot_3d('p_2000_no/len', 'del/len_mean', 'revision_t/len_filter_10000_mean', color_col='kmeans_label')
# Alternative clustering: MeanShift picks the number of clusters itself.
from sklearn.cluster import MeanShift
m_shift = MeanShift().fit(X)
m_shift.cluster_centers_
valid_u.loc[:, 'mshift_label'] = m_shift.labels_
# Per-cluster means of the linguistic metrics; 'ssd' is a rough 2-sigma
# standard error assuming quarter-sized clusters, as a significance yardstick.
for m in l_ms:
    print("{:>35}".format(m), end=' ')
    for l in range(0, 4):
        sub = valid_u[valid_u['kmeans_label'] == l]
        print("{:>6.3f}".format(sub[m].mean()), end=' ')
    print(' -- ssd: ', "{:>6.3f}".format(valid_u[m].std() / np.sqrt(len(valid_u) / 4) * 2))
def label1(row):
    """Predicate: the user was assigned to k-means cluster 1."""
    cluster = row['kmeans_label']
    return cluster == 1
def label2(row):
    """Predicate: the user was assigned to k-means cluster 2."""
    cluster = row['kmeans_label']
    return cluster == 2
# Compare linguistic metrics between clusters 1 and 2, and visualise the
# clusters in linguistic-metric space.
plot_distr(l_ms, cr1=label1, cr2=label2, by_user=False, by_length=False, tol=1, df=valid_u)
plot_3d('n_NDW_value', 'S.DependentC', 'L.Density', color_col='kmeans_label')
# Build a keystroke-metric × linguistic-metric significance matrix: for each
# linguistic split criterion, p_only=True makes plot_distr return p-values
# (one per keystroke metric) instead of drawing — presumably; confirm against
# plot_distr's definition.
ks_ms = ['latency', 'fluency', 't_filter_2000', 't_filter_10000',
         'p_510_no', 'p_2000_no', 'p_10000_no',
         'jumps', 'chunks', 'revision_t/t_filter_10000', 'revision_t/len_filter_10000',
         'separator_t/t_filter_10000', 'word_t/t_filter_10000']
ls_ms = ['Syntactic.ClausesPerSentence_value', 'Syntactic.DependentClausesPerSentence_value', 'Syntactic.CoordinatePhrasesPerSentence_value', 'n_SKolmogorov',
         'Lexical.Sophistication.BNC_value', 'Lexical.Sophistication.ANC_value', 'Lexical.Sophistication.NGSL_value','n_MKolmogorov',
         'Lexical.Density_value', 'n_NDW_value', 'n_RTTR_value',
         'n_Kolmogorov', ]
heat_map = []      # raw p-values
log_map = []       # -log(p): larger = more significant
log_log_map = []   # log(-log(p)): compresses the huge dynamic range
for l_m in ls_ms:
    l = plot_distr(ks_ms, split_cr=l_m, pct=0.2, \
                   by_user=True, by_length=True, tol=1, p_only=True)
    heat_map.append(l)
    log_map.append([(-1) * np.log(v) for v in l])
    log_log_map.append([ np.log((-1) * np.log(v)) for v in l])
def norm(a):
    """Double-center a 2-D float array in place: subtract from each entry the
    average of its row mean and column mean, so heatmap cells show deviation
    from their row/column baselines. Returns the (mutated) input array.

    Fixes two defects in the original loop version: the column mean was a sum
    over the rows divided by n_cols (the wrong dimension unless the matrix is
    square), and it was computed from entries already modified on earlier row
    iterations.
    """
    row_means = a.mean(axis=1, keepdims=True)  # shape (n_rows, 1)
    col_means = a.mean(axis=0, keepdims=True)  # shape (1, n_cols)
    # In-place subtraction preserves the original's mutate-and-return contract.
    a -= (row_means + col_means) / 2.0
    return a
# Render the double-centered log-log p-value matrix: rows = linguistic
# metrics, columns = keystroke metrics (with shortened labels).
ks_ms_lb = ['latency', 'fluency', 't_filter_2000', 't_filter_10000',
            'p_510_no', 'p_2000_no', 'p_10000_no',
            'jumps', 'chunks', 'revision_t/t', 'revision_t/len',
            'separator_t/t', 'word_t/t']
sns.heatmap(norm(np.array(log_log_map)), linewidths=.5,
            xticklabels=ks_ms_lb, yticklabels=ls_ms)
def train_eval(models,
               features=[(s[['text_len']], 'text len')],
               targets=[(s['Syntactic.ClausesPerSentence_value'], 'clauses per sentence')],
               verbose=1):
    """Train each model on every (features, target) pair and print R^2 scores.

    models: a model dict, or list of dicts, with keys:
        'm'    — an sklearn BaseEstimator instance, or a factory callable
                 taking input_s (feature count) plus 'b_args';
        'desc' — display name (optional);
        'b_args'/'f_args'/'p_args' — kwargs for build/fit/predict (optional).
    features: list of (DataFrame, description) feature sets.
    targets:  list of (Series, description) regression targets.
    verbose:  0 = only the per-feature-set average R^2 across targets,
              1 = adjusted test R^2 per run,
              2 = raw test and train R^2 plus fit+predict wall time.

    NOTE(review): the defaults capture the current `s` at definition time;
    re-run this cell after reloading `s` to pick up new data.
    NOTE(review): avg_r2 accumulates over targets AND models but is divided
    by len(targets) only — the verbose=0 summary assumes a single model.
    """
    if not isinstance(models, list):
        models = [models]
    avg_r2 = [0 for _ in range(0, len(features))]
    for y, t_desc in targets:
        if verbose > 0:
            print ("\n Targets: " + t_desc + "\n")
        for i, (X, f_desc) in enumerate(features):
            if verbose > 0:
                print ("Features: " + "{:<45}".format(f_desc), end='')
            # Fixed split seed: every model/feature set sees the same split.
            X_tr, X_ts, y_tr, y_ts = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
            for m_d in models:
                # Fill in defaults for optional model-dict keys.
                if 'desc' not in m_d:
                    m_d['desc'] = 'unknown m '
                for kw in 'bfp':
                    if kw+'_args' not in m_d:
                        m_d[kw+'_args'] = {}
                if isinstance(m_d['m'], base.BaseEstimator):
                    m = m_d['m']
                else:
                    # Model has to be built (factory callable, e.g. a Keras builder)
                    m = m_d['m'](input_s=X_tr.shape[1], **m_d['b_args'])
                # (Removed a dead `ts = time.time(); accuracies = []` before the
                # split: ts was overwritten here and accuracies never used.)
                ts = time.time()
                m.fit(X_tr, y_tr, **m_d['f_args'])
                if verbose > 0:
                    print (" M: " + m_d['desc'] + " # ", end='')
                y_pred = m.predict(X_ts, **m_d['p_args'])
                y_tr_pred = m.predict(X_tr, **m_d['p_args'])
                # Adjusted R^2 penalises feature count: 1-(1-R^2)(n-1)/(n-p-1).
                adj_R2 = 1 - (1 - metrics.r2_score(y_ts, y_pred)) * (X.shape[0]-1) / (X.shape[0]-X.shape[1]-1)
                avg_r2[i] += metrics.r2_score(y_ts, y_pred)
                if verbose == 1:
                    print ('adj test R2: ' + "{0:.2f}".format(100 * adj_R2), end='%\n' )
                if verbose == 2:
                    print (" test R2: " + "{0:.2f}".format(100 * metrics.r2_score(y_ts, y_pred)), end='%' )
                    print (" - tr R2: " + "{0:.2f}".format(100 * metrics.r2_score(y_tr, y_tr_pred)), end='%' )
                    print (" T: " + "{0:.1f}".format(time.time() - ts) + "s")
    if verbose == 0:
        # Summary pass: average R^2 per feature set, over all targets.
        for i, (X, f_desc) in enumerate(features):
            print ("Features: " + "{:<45}".format(f_desc), end='')
            print (" Avg R2 : " + "{0:.2f}".format(100 * avg_r2[i] / len(targets)), end='%\n')
# Predict every linguistic metric (columns 8:31 of s) from various keystroke
# feature sets; verbose=0 prints only the per-feature-set average R^2.
train_eval([{'m': ensemble.HistGradientBoostingRegressor(max_iter=200, learning_rate=0.15, l2_regularization=40000), 'desc': "histGB reg"}],
           features=[
               (s[['t/len_filter_10000']], 't/len filter 10000'),
               (s[['fluency', 'latency']], 'fluency + latency'),
               (s[['revision_t/len_filter_10000', 'k/len']], 'revision_t/len_filter_10000'),
               (s[['word_t/t_filter_10000', 'separator_t/t_filter_10000']], 'word_t/t_filter_10000'),
               (s[['max_pos_pct'] + ['n_' + str(i) + '_fraction' for i in range(0,10)]], 'positional information only'),
               (s[['t/len_filter_10000', 'revision_t/len_filter_10000',
                   'word_t/t_filter_10000', 'k/len'] + ['max_pos_pct'] + ['n_' + str(i) + '_fraction' for i in range(0,10)]], 'all of above, length filtered'),
               (s[['jumps', 'chunks']], 'jumps and chunks'),
               (s[['text_len']], 'length only'),
               (s[list(s.columns[37:])], 'all keystr metrics'),
               # Noise baseline: the R^2 floor any real feature set must beat.
               (pd.DataFrame(np.random.normal(0,100,size=(len(s), 5))), 'random noise')],
           targets=[(s[name], name) for name in list(s.columns[8:31])],
           verbose=0)
# Same direction (keystrokes -> linguistics) but with per-run verbose output
# and a handful of hand-picked targets, including both deflate variants.
train_eval([{'m': ensemble.HistGradientBoostingRegressor(max_iter=500, learning_rate=0.15, l2_regularization=10000), 'desc': "histGB reg"}],
           features=[(s[['text_len']], 'length only'),
                     (s[['t/len_filter_10000', 'max_pos_pct', 'revision_t/len_filter_10000',
                         'word_t/t_filter_10000', 'k/len']], 'misc keystroke metrics, length filtered'),
                     (s[['fluency', 'latency']], 'fluency + latency'),
                     (s[['max_pos_pct'] + ['n_' + str(i) + '_fraction' for i in range(0,10)]], 'positional information only'),
                     (s[[
                         't_filter_10000', 'p_510_no', 'p_1000_no',
                         'p_2000_no', 'p_5000_no', 'p_10000_no', 'max_t', 'max_pos', 'max_pos_pct',
                         't/len_filter_10000', 'fluency', 'latency', 't_first_2', 't_first_5', 't_first_20',
                         't/len_0_fraction', 't/len_1_fraction', 't/len_2_fraction',
                         't/len_3_fraction', 't/len_4_fraction', 't/len_5_fraction',
                         't/len_6_fraction', 't/len_7_fraction', 't/len_8_fraction',
                         't/len_9_fraction', 'del_t_filter_10000', 'del_t/t_filter_10000', 'jumps', 'chunks',
                         'revision_t/t_filter_10000', 'revision_t/len_filter_10000',
                         'word_t/t_filter_10000', 'separator_t/t_filter_10000']], 'keystr metrics'),
                     # Noise baseline feature set.
                     (pd.DataFrame(np.random.normal(0,100,size=(len(s), 5))), 'random noise')],
           targets=[(s['Syntactic.ClausesPerSentence_value'], 'no of clauses'),
                    (s['Syntactic.ComplexNominalsPerSentence_value'], 'no of complex nominals'),
                    (s['Lexical.Sophistication.BNC_value'], 'sophistication BNC'),
                    (s['Lexical.Density_value'], 'density'),
                    (s['Lexical.Diversity.CNDW_value'], 'diversity'),
                    (s['KolmogorovDeflate_value'], 'non-corrected deflate'),
                    (s['n_Kolmogorov'], 'corrected deflate')])
# Inverse direction: predict keystroke behaviour (fluency, latency) from
# linguistic complexity metrics.
train_eval([{'m': ensemble.HistGradientBoostingRegressor(max_iter=50, learning_rate=0.15, l2_regularization=40000), 'desc': "histGB reg"}],
           features=[
               (s[['Syntactic.DependentClausesPerSentence_value']], 'Syntactic.DependentClausesPerSentence_value'),
               # NOTE(review): the feature column is the raw KolmogorovDeflate_value
               # but the label reads 'n_Kolmogorov' (the corrected variant) —
               # confirm which column was intended.
               (s[['KolmogorovDeflate_value']], 'n_Kolmogorov'),
               (s[['Lexical.Density_value']], 'Lexical.Density_value'),
               (s[list(s.columns[8:31])], 'all lingusitic metrics'),
               (pd.DataFrame(np.random.normal(0,100,size=(len(s), 5))), 'random noise')],
           targets=[(s[name], name) for name in ['fluency', 'latency']],
           verbose=0)
# z-score mean word length into s['my'] for the locality analysis below.
s['my'] = (s['Morphological.MeanLengthWord_value'] - s['Morphological.MeanLengthWord_value'].mean()) / s['Morphological.MeanLengthWord_value'].std()
# Reference: std of the difference of two independent unit normals is sqrt(2).
(np.random.normal(size=10000) - np.random.normal(size=10000)).std()
# The fact that this is lower than random (sqrt(2), as above)
# proves that complexity is local in nature
(s['my'] - s.shift(1)['my']).std()
# How fast does the sentence-to-sentence complexity correlation decay? Average
# the std of lag-j differences over all lags up to each window size.
# FIX: the loop variable was `w`, which clobbered the global words DataFrame
# `w` loaded at the top of the notebook — renamed to `win`.
l = []
for win in range(2, 100):
    avg = 0
    for j in range(1, win):
        avg += (s['my'] - s.shift(j)['my']).std()
    avg /= win - 1
    l.append(avg)
plt.plot(l)
m = 'Morphological.MeanLengthWord_value'
# Window sizes: 1,2,3,4,6,8 × powers of ten; [1:] drops the leading size-1
# window (generated first), which would divide by zero below.
windows = sorted([n * (10**e) for n in [1, 2, 3, 4, 6, 8] for e in [0, 1, 2, 3]][1:])
# For each window size, add a column holding the mean of the previous ws-1
# sentences' word length; NaNs (near the start) filled with the column mean.
# FIX: the inner loop variable was `w`, which clobbered the global words
# DataFrame `w` loaded at the top of the notebook — renamed to `lag`.
for ws in windows:
    s[m + str(ws)] = 0
    for lag in range(1, ws):
        s[m + str(ws)] += s[m].shift(lag)
    s[m + str(ws)] /= ws - 1
    s[m + str(ws)].fillna(s[m + str(ws)].mean(), inplace=True)
# Does adding a trailing-window average of word length improve prediction of
# typing time? One feature set per window size, against the bare metric.
train_eval([{'m': ensemble.HistGradientBoostingRegressor(max_iter=50, learning_rate=0.15, l2_regularization=40000), 'desc': "histGB reg"}],
           features=[
               (s[[m]], m)] +\
           [(s[[m, m+str(n)]], m + str(n)) for n in windows],
           targets=[(s[name], name) for name in ['t_filter_10000']],
           verbose=0)
# Runs in approx. 50s
# Build contiguous [start, end) row ranges per sentence id in k_sorted so a
# sentence's keystrokes can be sliced with .iloc instead of a boolean filter.
# (Duplicates the same computation near the top of the notebook.)
k_sorted = k.sort_values(by='s_id')
rs = 0; re = 0; ranges = {}
# .items() replaces Series.iteritems(), deprecated and removed in pandas 2.0;
# .items() exists on older pandas too, so this stays backward compatible.
for idx, val in k_sorted['s_id'].value_counts().sort_index().items():
    re += val
    ranges[idx] = (rs, re)
    rs += val
## Runs in 40s
# Build a fixed-width (500) matrix of per-sentence adjusted log keystroke
# timings, zero-padded; sentences with >500 keystrokes are downsampled.
# NOTE(review): .sample() is unseeded, so downsampled rows are
# non-deterministic across runs — confirm whether that matters downstream.
ft = []
for s_id, row in s.iterrows():
    # Single lookup with `is not None` (was `!= None` plus a second .get()).
    span = ranges.get(s_id)
    if span is not None:
        t, v = span
        k_sentence = k_sorted.iloc[t:v, :]
    else:
        # Sentence has no keystrokes: keep an all-zero row.
        k_sentence = k_sorted.iloc[0:0, :]
    ks_list = np.zeros(500)
    if len(k_sentence) > 500:
        k_sentence = k_sentence.sample(n=500).sort_index()
    ks_list[:len(k_sentence)] = k_sentence['adj_log_t'].values
    ft.append(ks_list)
adj_log_t_sequences = np.array(ft)
n_Kolm = s['n_Kolmogorov'].values
adj_log_t_sequences.shape
time_par = 3 # scales training effort: 1 = ~40s per model, ~5min total
# Baseline model zoo (non-Keras), effort scaled by time_par.
nonNN_models = [{'m': linear_model.LinearRegression(), 'desc': "LinearReg."},
                {'m': ensemble.RandomForestRegressor(n_estimators=10*time_par), 'desc': "RForest "},
                {'m': ensemble.GradientBoostingRegressor(n_estimators=30*time_par), 'desc': "GrBoostR "},
                {'m': ensemble.HistGradientBoostingRegressor(max_iter=200*time_par), 'desc': "HistBoostR"},
                {'m': neural_network.MLPRegressor(hidden_layer_sizes=(170, 170,), activation='relu', max_iter=4*time_par), 'desc': "2-layer NN"},
                {'m': neural_network.MLPRegressor(hidden_layer_sizes=(60, 60, 60, 60, 60,), activation='relu', max_iter=6*time_par), 'desc': "5-layer NN"},
               ]
train_eval(nonNN_models)
# A single deeper tanh MLP; train_eval accepts a bare dict and wraps it.
train_eval({'m': neural_network.MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100,),
                                             activation='tanh', max_iter=100), 'desc': "5-layer NN"})
def seq_model(input_s, layers_no=4, layers_s=256,
              activation='tanh', optimizer=None, dropout=False, **kwargs):
    """Build and compile a fully-connected Keras regression network.

    input_s: input dimensionality (matches train_eval's builder protocol).
    layers_no/layers_s: number and width of hidden Dense layers.
    optimizer: Keras optimizer; a fresh Nadam is created per call when None.
    dropout: if True, insert Dropout(0.5) before each hidden layer after the
        first. Extra kwargs are forwarded to every Dense layer.
    """
    # FIX: the original default `optimizer=keras.optimizers.Nadam()` was
    # evaluated once at definition time, so every model built with the default
    # shared one stateful optimizer instance; Keras optimizers must not be
    # reused across models.
    if optimizer is None:
        optimizer = keras.optimizers.Nadam()
    model = Sequential()
    # Dense(64) is a fully-connected layer with 64 hidden units.
    # in the first layer, you must specify the expected input data shape:
    # here, 20-dimensional vectors.
    model.add(Dense(layers_s, activation=activation, input_dim=input_s, **kwargs))
    for _ in range(0, layers_no-1):
        if dropout:
            model.add(Dropout(0.5))
        model.add(Dense(layers_s, activation=activation, **kwargs))
    # Single-output regression head; 'relu' clamps predictions to >= 0.
    model.add(Dense(1, activation='relu', **kwargs))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_absolute_error'])
    return model
# PCA on the 500-dim padded keystroke-timing sequences; inspect how much
# variance the top 20 components capture.
my_pca = decomposition.PCA(n_components=20)
my_pca.fit(adj_log_t_sequences)
my_pca.explained_variance_ratio_